Question 1:

First we will introduce our libraries:

library(dplyr)
library(readr)
library(highcharter)
library(plotly)
library(ggplot2)
library(ROCR)
library(grid)
library(scales)
library(gridExtra)
library(tidyr)
library(ggthemes)

Now we will use the plotly package:

# Read the historical earthquake data and draw an interactive 3D scatter of
# epicentre position vs. depth, with marker size proportional to magnitude.
earth <- readRDS("/Users/shervin/Downloads/week_11/data/historical_web_data_26112015.rds")
# BUG FIX: inside plot_ly() formulas, columns are resolved against `data`, so
# `~earth$Longitude` was redundant (and breaks if `earth` is renamed).  The
# trace type and mode are stated explicitly instead of letting plotly guess
# (which produced the "No trace type specified" warnings).
plot_ly(earth,
        x = ~Longitude,
        y = ~Latitude,
        z = ~Depth,
        size = ~Magnitude,
        type = "scatter3d",
        mode = "markers")

Question 2:

disaster = read_delim("/Users/shervin/Downloads/week_11/data/disaster.txt", "\t", escape_double = FALSE, trim_ws = TRUE)
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   FLAG_TSUNAMI = col_character(),
##   SECOND = col_character(),
##   EQ_PRIMARY = col_double(),
##   EQ_MAG_MW = col_double(),
##   EQ_MAG_MS = col_double(),
##   EQ_MAG_MB = col_character(),
##   EQ_MAG_ML = col_double(),
##   EQ_MAG_MFA = col_character(),
##   EQ_MAG_UNK = col_double(),
##   COUNTRY = col_character(),
##   STATE = col_character(),
##   LOCATION_NAME = col_character(),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   MISSING = col_character(),
##   DAMAGE_MILLIONS_DOLLARS = col_character(),
##   TOTAL_MISSING = col_character(),
##   TOTAL_MISSING_DESCRIPTION = col_character(),
##   TOTAL_DAMAGE_MILLIONS_DOLLARS = col_character()
## )
## See spec(...) for full column specifications.
# Keep one row per event with just the fields the bubble map needs.
# BUG FIX: the original select() renamed YEAR to `sequence` twice (duplicate
# output names are an error in current dplyr) and selected FLAG_TSUNAMI only
# to drop it again in a second select().
disaster %>%
  select(lat = LATITUDE, lon = LONGITUDE, z = EQ_MAG_ML,
         name = COUNTRY, sequence = YEAR) %>%
  na.omit() -> disaster_temp

hcmap() %>%
  hc_add_series(data = disaster_temp, type = "mapbubble",
                minSize = 0, maxSize = 30, color = "blue") %>%
  hc_plotOptions(series = list(showInLegend = FALSE))

## Question 3:

# Question 3: 2D density of Iranian earthquakes with magnitude above 3.
iran_earthquake <- read_rds("/Users/shervin/Downloads/week_11/data/iran_earthquake.rds")
iran_earthquake %>% filter(Mag > 3) -> iran_earthquake_temp

# BUG FIX: the original mapped Lat to x but labelled the x axis "Longitude";
# the aesthetics are swapped here so each axis label matches its variable
# (and the plot follows the usual lon-on-x map convention).
plot <- ggplot(data = iran_earthquake_temp, aes(x = Long, y = Lat)) + geom_point()
plot + stat_density_2d(aes(fill = ..level..), geom = "polygon") +
  xlab("Longitude") + ylab("Latitude")

# Filter out far-away outliers to get a better diagram:
iran_earthquake_temp %>% filter(Lat <= 45, Long <= 70) -> iran_earthquake_temp_checked

plot <- ggplot(data = iran_earthquake_temp_checked, aes(x = Long, y = Lat)) + geom_point()
plot + stat_density_2d(aes(fill = ..level..), geom = "polygon") +
  xlab("Longitude") + ylab("Latitude")

## Question 4:

great_earthquake <- disaster %>% filter(COUNTRY == "IRAN", EQ_PRIMARY >= 7) # strongest earthquakes in Iran

# Find the cycle (gap in years) between consecutive magnitude >= 7 earthquakes.
# BUG FIX: the original shifted the year column and padded with 0, which
# produced a spurious trailing value (-2017) and required a hard-coded row
# index ([-26]) to try to drop it; diff() computes the gaps directly.
Year <- great_earthquake %>% filter(YEAR > 0) %>% pull(YEAR)
Diff <- diff(Year) # time between every two earthquakes with magnitude >= 7
print(Diff)

# Now find the exact time elapsed since the last great earthquake:
iran_earthquake %>% filter(Mag >= 7) %>% select(OriginTime) %>% arrange(desc(OriginTime)) -> time

From this we can see that the last great earthquake was more than two years ago, and since the interval between such earthquakes is usually shorter than this, the probability that we will experience another earthquake in the near future is high.

Question 5:

# Per-country total and mean death toll from earthquakes.
disaster %>%
  select(lat = LATITUDE, lon = LONGITUDE, z = TOTAL_DEATHS, name = COUNTRY) %>%
  na.omit() %>%
  group_by(name) %>%
  summarise(mean = mean(z), total = sum(z)) -> disaster_average

# Choropleth of the summed death toll per country.
# BUG FIX: the series name said "Average total Deaths" although this map shows
# the total; the subtitle said "peoples".
hcmap(data = disaster_average, value = "total", name = "Total Deaths by Earthquake",
      dataLabels = list(enabled = TRUE, format = '{point.name}')) %>%
  hc_title(text = 'Total Deaths of Countries by Earthquake',
           align = 'center') %>%
  hc_subtitle(text = 'displayed by number of people', align = 'center')
# Choropleth of the average death toll per country.
hcmap(data = disaster_average, value = "mean", name = "Average total Deaths by Earthquake",
      dataLabels = list(enabled = TRUE, format = '{point.name}')) %>%
  hc_title(text = 'Average total Death of Countries by Earthquake',
           align = 'center') %>%
  hc_subtitle(text = 'displayed by number of people', align = 'center')

Question 6:

We can fit a generalised linear model (GLM) using the given columns of Latitude and Longitude. A fair guess is to take the family of this GLM to be Poisson. We can check this assumption by plotting the density of the deaths:

# Poisson GLM of deaths on location, focal depth and primary magnitude.
# BUG FIX: removed the stray fragment `disaster %>% select(lat = )`, which
# merely returned a 0-column tibble (the density check is done below with
# hchart()).
model <- glm(data = disaster,
             DEATHS ~ LATITUDE + LONGITUDE + FOCAL_DEPTH + EQ_PRIMARY,
             family = poisson(link = "log"))
summary(model)
## 
## Call:
## glm(formula = DEATHS ~ LATITUDE + LONGITUDE + FOCAL_DEPTH + EQ_PRIMARY, 
##     family = poisson(link = "log"), data = disaster)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -465.08   -56.07   -35.51   -17.72  1520.67  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.641e+00  5.209e-03 -315.01   <2e-16 ***
## LATITUDE     2.776e-02  3.245e-05  855.56   <2e-16 ***
## LONGITUDE    2.751e-04  6.960e-06   39.52   <2e-16 ***
## FOCAL_DEPTH -2.904e-02  4.618e-05 -628.84   <2e-16 ***
## EQ_PRIMARY   1.363e+00  7.126e-04 1912.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 16778240  on 1182  degrees of freedom
## Residual deviance: 12197978  on 1178  degrees of freedom
##   (4843 observations deleted due to missingness)
## AIC: 12203656
## 
## Number of Fisher Scoring iterations: 8
# Plot the empirical density of the death counts to eyeball the Poisson
# assumption made above.
disaster_deaths <- disaster %>%
  select(DEATHS) %>%
  na.omit() %>%
  as.matrix()
hchart(density(disaster_deaths)) %>%
  hc_title(text = 'Density Distribution of death') %>%
  hc_xAxis(title = list(text = "Number of Deaths")) %>%
  hc_yAxis(title = list(text = "Probability Density"))

Which we can see is actually a good assumption.

Question 7:

We have to make a glm prediction model and make a prediction based on this model.

# We first have to make the longitude and latitude a round number:
# Load the worldwide catalogue, keep earthquake rows only, and round the
# coordinates to the nearest degree so events can be grouped into grid cells.
worldwide <- read.csv("/Users/shervin/Downloads/week_11/data/worldwide.csv")
worldwide_temp <- worldwide %>%
  filter(type == "earthquake") %>%
  select(time, latitude, longitude, mag)
worldwide_temp <- worldwide_temp %>%
  mutate(lat = round(latitude),
         lon = round(longitude)) %>%
  select(time, lat, lon, mag) %>%
  na.omit()
worldwide_temp$time <- as.Date(worldwide_temp$time)
# A running month index (years since 2000 times 12, plus the month number) so
# each cell-month can be grouped as one observation.
worldwide_temp <- worldwide_temp %>%
  mutate(Month = 12 * as.integer(format(time, "%y")) + as.integer(format(time, "%m")))

# We define a "pish larzeh" (foreshock) cell-month as one whose strongest
# event has magnitude 4 or less; Larzeh = 1 marks cell-months whose maximum
# magnitude exceeded 4.
worldwide_temp %>%
  group_by(lat, lon, Month) %>%
  summarise(max_e = max(mag), count = n(), average = sum(mag) / count) %>%
  mutate(Larzeh = as.integer(max_e > 4)) %>%
  na.omit() -> worldwide_temp_larzeh
worldwide_temp_larzeh$Larzeh <- as.factor(worldwide_temp_larzeh$Larzeh)

# 80/20 train/test split (anti_join keeps the rows not sampled into test).
test <- worldwide_temp_larzeh %>% sample_frac(0.2)
train <- anti_join(worldwide_temp_larzeh, test)
model <- glm(data = train,
             formula = Larzeh ~ count,
             family = binomial(link = 'logit'))
# BUG FIX: predict() takes `newdata`, not `data`; with `data = test` the
# argument was silently ignored and predictions were made on the TRAINING set.
predict <- predict(model, newdata = test,
                   type = "response")

Question 8:

We can first use a correlation test to check whether these two are correlated. If they are correlated to a high degree, we have to check whether we are falling into the "correlation is causation" trap, so we use another test to examine this assumption. We can use a chi-squared test to check whether these two come from the same distribution; if this test also agrees, we may interpret the result as the two affecting each other.

worldwide <- read.csv("/Users/shervin/Downloads/week_11/data/worldwide.csv")

# Two-sided Spearman rank correlation between depth and magnitude.
worldwide.selected <- worldwide %>%
  select(depth, mag) %>%
  na.omit()
q <- cor.test(worldwide.selected$depth, worldwide.selected$mag,
              alternative = c("two.sided"), method = c("spearman"))
q
## 
##  Spearman's rank correlation rho
## 
## data:  worldwide.selected$depth and worldwide.selected$mag
## S = 2.8439e+13, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2344252
# Chi-squared test of independence between magnitude and depth.
# NOTE(review): both variables are continuous, so chisq.test() treats every
# distinct value as its own category; the p-value of 1 below is therefore not
# meaningful — the variables would need to be binned first. Confirm intent.
chisq.test(worldwide$mag , worldwide$depth)
## 
##  Pearson's Chi-squared test
## 
## data:  worldwide$mag and worldwide$depth
## X-squared = 3403700, df = 4463700, p-value = 1

The first test confirms that there exists a statistically significant but weak correlation between these two (the p-value is below our cutoff, but rho is only about 0.23). So we can conclude that the two are related, though not strongly.

Question 9:

We will first find the average number of earthquakes. After that, to see whether HAARP was real or not, we need to run a t-test: state a null hypothesis and check whether it can be rejected. The claim is that HAARP was intended to activate Japanese earthquakes, so our null hypothesis is that the numbers of earthquakes before and after HAARP are not different.

# Reload the worldwide catalogue with readr (column types are auto-detected;
# note `time` is parsed as a datetime here, unlike read.csv above).
worldwide = read_csv("/Users/shervin/Downloads/week_11/data/worldwide.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   time = col_datetime(format = ""),
##   magType = col_character(),
##   nst = col_integer(),
##   net = col_character(),
##   id = col_character(),
##   updated = col_datetime(format = ""),
##   place = col_character(),
##   type = col_character(),
##   magNst = col_integer(),
##   status = col_character(),
##   locationSource = col_character(),
##   magSource = col_character()
## )
## See spec(...) for full column specifications.
# Yearly earthquake counts per exact epicentre coordinate.
worldwide_earthquakes_count <- worldwide %>%
  filter(type == "earthquake") %>%
  select(longitude, latitude, time) %>%
  mutate(year = as.numeric(format(time, "%Y"))) %>%
  group_by(longitude, latitude, year) %>%
  summarise(count = n())

print(worldwide_earthquakes_count)
## # A tibble: 60,105 x 4
## # Groups:   longitude, latitude [?]
##    longitude latitude  year count
##        <dbl>    <dbl> <dbl> <int>
##  1      -180    -24.4  2016     1
##  2      -180    -23.5  2015     1
##  3      -180    -37.2  2016     1
##  4      -180    -24.8  2017     1
##  5      -180    -37.0  2016     1
##  6      -180    -23.8  2016     1
##  7      -180    -24.9  2016     1
##  8      -180    -18.3  2017     1
##  9      -180    -24.0  2017     1
## 10      -180     51.1  2018     1
## # ... with 60,095 more rows
# Yearly earthquake counts per named place string.
worldwide_earthquakes_count_place <- worldwide %>%
  filter(type == "earthquake") %>%
  select(place, time) %>%
  mutate(year = as.numeric(format(time, "%Y"))) %>%
  group_by(place, year) %>%
  summarise(count = n())

print(worldwide_earthquakes_count_place)
## # A tibble: 43,523 x 3
## # Groups:   place [?]
##    place                                         year count
##    <chr>                                        <dbl> <int>
##  1 0km E of Loiza, Puerto Rico                   2016     1
##  2 0km E of Monte Grande, Puerto Rico            2017     1
##  3 0km E of San Andres, Philippines              2017     1
##  4 0km E of San Ramon, California                2016     1
##  5 0km E of The Geysers, California              2017     1
##  6 0km ENE of Balangiga, Philippines             2018     1
##  7 0km ENE of Candabong, Philippines             2017     1
##  8 0km ENE of Estancias de Florida, Puerto Rico  2016     1
##  9 0km ENE of Las Ollas, Puerto Rico             2016     1
## 10 0km ENE of Talisayan, Philippines             2017     1
## # ... with 43,513 more rows
# HAARP: compare Japanese earthquake death tolls before vs. after 1993, the
# year HAARP became operational.
disaster %>%
  filter(COUNTRY == "JAPAN", YEAR >= 1950) %>%
  select(YEAR, DEATHS) %>%
  arrange(desc(YEAR)) %>%
  na.omit() -> disaster_japan

# BUG FIX: the original used YEAR > 1950 (dropping 1950 even though the filter
# above deliberately kept it) and YEAR < 1993 / YEAR > 1993, so 1993 itself
# fell into neither group. Here "after" means 1993 onward.
Haarp_before <- disaster_japan %>% filter(YEAR < 1993) %>% select(DEATHS) %>% as.matrix()
Haarp_after <- disaster_japan %>% filter(YEAR >= 1993) %>% select(DEATHS) %>% as.matrix()
t.test(Haarp_before, Haarp_after)
## 
##  Welch Two Sample t-test
## 
## data:  Haarp_before and Haarp_after
## t = -1.2265, df = 14.006, p-value = 0.2402
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1254.7459   341.7459
## sample estimates:
## mean of x mean of y 
##      17.5     474.0

The t-test fails to reject the null hypothesis that mean deaths were the same before and after HAARP (p = 0.24), so we find no evidence that HAARP had anything to do with the total earthquake deaths in Japan — the HAARP claim is not supported.

Question 10:

  1. What is the sum and average of the damage of earthquake in every country?
# Per-country total and mean earthquake damage (millions of dollars).
disaster %>%
  select(lat = LATITUDE, lon = LONGITUDE, z = DAMAGE_MILLIONS_DOLLARS, name = COUNTRY) %>%
  na.omit() %>%
  group_by(name) %>%
  # BUG FIX: DAMAGE_MILLIONS_DOLLARS is a character column holding decimal
  # values; as.integer() truncated them — convert with as.numeric() instead.
  summarise(mean = mean(as.numeric(z)), total = sum(as.numeric(z))) %>%
  select(name, mean, total) -> disaster_average

# BUG FIX: the series names and titles said "Deaths" (copied from Question 5)
# even though these maps show damage in millions of dollars.
hcmap(data = disaster_average, value = "total", name = "Total Damage by Earthquake",
      dataLabels = list(enabled = TRUE, format = '{point.name}')) %>%
  hc_title(text = 'Total Damage of Countries by Earthquake',
           align = 'center') %>%
  hc_subtitle(text = 'displayed in millions of dollars', align = 'center')
hcmap(data = disaster_average, value = "mean", name = "Average Damage by Earthquake",
      dataLabels = list(enabled = TRUE, format = '{point.name}')) %>%
  hc_title(text = 'Average Damage of Countries by Earthquake',
           align = 'center') %>%
  hc_subtitle(text = 'displayed in millions of dollars', align = 'center')
  2. Which points on Earth have the greatest number of earthquakes?
# Count events per 1-degree grid cell and bubble-map the 20 busiest cells.
worldwide_temp <- worldwide %>%
  filter(type == "earthquake") %>%
  select(time, latitude, longitude, mag)
worldwide_temp <- worldwide_temp %>%
  mutate(lat = round(latitude),
         lon = round(longitude))

worldwide_temp <- worldwide_temp %>%
  group_by(lat, lon) %>%
  summarise(z = n()) %>%
  arrange(desc(z))

hcmap() %>%
  hc_add_series(data = worldwide_temp[1:20, ], type = "mapbubble",
                minSize = 0, maxSize = 30, color = "blue") %>%
  hc_plotOptions(series = list(showInLegend = FALSE))

3) Is the depth of an earthquake correlated with its magnitude?

# Dichotomise: "deep" = depth above e^2 (~7.4 km), "high_mag" = magnitude > 5.
# BUG FIX: removed the dead recomputation of worldwide_temp that preceded this
# — it was never used; the analysis works on `worldwide` directly.
# NOTE(review): log(depth) is NaN/-Inf for depth <= 0, producing NA rows;
# t.test() drops NAs by default, so the result is unaffected — confirm.
worldwide %>%
  mutate(deep = as.integer(log(depth) > 2),
         high_mag = as.integer(mag > 5)) %>%
  select(deep, high_mag) -> worldwide_depth
t.test(worldwide_depth$deep ~ worldwide_depth$high_mag)
## 
##  Welch Two Sample t-test
## 
## data:  worldwide_depth$deep by worldwide_depth$high_mag
## t = -33.508, df = 4432.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1201130 -0.1068347
## sample estimates:
## mean in group 0 mean in group 1 
##       0.8580998       0.9715736

We see that the null hypothesis of equal means is rejected, so deep and shallow events differ in their proportion of high-magnitude shocks (though this alone does not prove depth causes magnitude). We can quantify the association with Spearman's correlation:

# BUG FIX: with ties in the ranks an exact p-value cannot be computed, which
# triggered the "Cannot compute exact p-value with ties" warning; request the
# asymptotic approximation explicitly (the estimates are identical).
cor.test(worldwide_depth$deep, worldwide_depth$high_mag,
         alternative = c("two.sided"), method = c("spearman"), exact = FALSE)
## 
##  Spearman's rank correlation rho
## 
## data:  worldwide_depth$deep and worldwide_depth$high_mag
## S = 3.416e+13, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.07131091

This states that the two are significantly, though only very weakly (rho ≈ 0.07), correlated.